Load libraries

library(dplyr)      # data-frame manipulation verbs (filter, mutate, ...)
library(readr)      # read_csv() for loading the raw data
library(ggplot2)    # static plots
library(maps)       # world map polygons used by borders('world')
library(reshape2)   # NOTE(review): not used in the visible code — confirm before removing
library(tidyr)      # gather() for reshaping before faceting
library(purrr)      # keep() for selecting numeric columns
library(lubridate)  # parse_date_time(), year(), month(), day()
library(maptools)   # NOTE(review): retired/archived on CRAN and not visibly used — confirm
library(plotly)     # interactive animated scatter at the end of the script

Load data

df <- read_csv('./data/database.csv')
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_character(),
##   Time = col_time(format = ""),
##   Type = col_character(),
##   `Magnitude Type` = col_character(),
##   ID = col_character(),
##   Source = col_character(),
##   `Location Source` = col_character(),
##   `Magnitude Source` = col_character(),
##   Status = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 3 parsing failures.
##   row  col   expected                   actual                  file
##  3379 Time time like  1975-02-23T02:58:41.000Z './data/database.csv'
##  7513 Time time like  1985-04-28T02:53:41.530Z './data/database.csv'
## 20651 Time time like  2011-03-13T02:23:34.520Z './data/database.csv'
# Drop the 3 rows whose Time failed to parse (NA Time), then fall back
# to a plain base-R data.frame for downstream bracket indexing
df_filtered <- df %>%
  filter(!is.na(Time)) %>%
  as.data.frame()

Replace column names

colnames(df_filtered) <- c('date', 'time', 'lat', 'lon', 'type', 'depth', 'depth_er', 'depth_ss', 'magnit', 'magnit_type', 'magnit_er', 'magnit_ss', 'azim', 'horiz_dist', 'horiz_er', 'rms', 'id', 'source', 'loc_source', 'magnit_source', 'status')

Convert date to year/month/day format

# Parse the character date (month/day/year) into a Date, then derive
# year (numeric) plus month and day as factors for grouping/plotting.
# mutate() evaluates sequentially, so year/month/day see the parsed date.
df_filtered <- df_filtered %>%
  mutate(
    date  = as_date(parse_date_time(date, orders = c('mdy'))),
    year  = year(date),
    month = factor(month(date)),
    day   = factor(day(date))
  )

Dealing with missing values

Percentage of missing values in each column

round(100*colMeans(is.na(df_filtered)),2)
##          date          time           lat           lon          type 
##          0.00          0.00          0.00          0.00          0.00 
##         depth      depth_er      depth_ss        magnit   magnit_type 
##          0.00         80.95         69.69          0.00          0.01 
##     magnit_er     magnit_ss          azim    horiz_dist      horiz_er 
##         98.60         89.05         68.82         93.15         95.06 
##           rms            id        source    loc_source magnit_source 
##         25.88          0.00          0.00          0.00          0.00 
##        status          year         month           day 
##          0.00          0.00          0.00          0.00

Drop columns

# Drop the columns that are mostly missing (see percentages above) plus
# magnit_type and the row-unique id, which carry no analytic value here
df_filtered <- df_filtered %>%
  select(-depth_er, -magnit_er, -azim, -horiz_er, -horiz_dist,
         -magnit_type, -magnit_ss, -depth_ss, -id)

Replace RMS missing values with the mean

# Impute missing RMS values with the column mean.
# BUG FIX: mean(rms) is NA whenever rms itself contains NAs, so the
# original replacement silently re-inserted NA — the colSums() check
# below still reported 6059 missing rms values. na.rm = TRUE computes
# the mean over the observed values only.
df_filtered <- df_filtered %>% 
  mutate(rms = replace(rms, is.na(rms), mean(rms, na.rm = TRUE)))

Missing value count

colSums(is.na(df_filtered))
##          date          time           lat           lon          type 
##             0             0             0             0             0 
##         depth        magnit           rms        source    loc_source 
##             0             0          6059             0             0 
## magnit_source        status          year         month           day 
##             0             0             0             0             0

Count number of unique values in each column

rapply(df_filtered, function(x) length(unique(x)))
##          date          time           lat           lon          type 
##         12398         20469         20673         21472             4 
##         depth        magnit           rms        source    loc_source 
##          3485            64           191            13            48 
## magnit_source        status          year         month           day 
##            24             2            52            12            31

Dealing with categorical variables

# Inspect the distinct levels of each categorical column before they are
# converted to factors below
unique(df_filtered$type)
## [1] "Earthquake"        "Nuclear Explosion" "Explosion"        
## [4] "Rock Burst"
# Review status of each record
unique(df_filtered$status)
## [1] "Automatic" "Reviewed"
# Agency codes that supplied the magnitude estimate
unique(df_filtered$magnit_source)
##  [1] "ISCGEM"   "OFFICIAL" "CI"       "US"       "1020"     "BRK"     
##  [7] "NC"       "1000"     "GCMT"     "1009"     "UW"       "1023"    
## [13] "ATLAS"    "HRV"      "PAR"      "NIED"     "NN"       "SE"      
## [19] "PGC"      "US_GCMT"  "US_PGC"   "AK"       "PR"       "GUC"
# Agency codes that supplied the location
unique(df_filtered$loc_source)
##  [1] "ISCGEM" "CI"     "US"     "H"      "U"      "G"      "NC"    
##  [8] "B"      "GCMT"   "AG"     "UW"     "SPE"    "HVO"    "BRK"   
## [15] "ATLAS"  "AGS"    "PGC"    "BOU"    "SLC"    "OTT"    "AEI"   
## [22] "AEIC"   "CASC"   "ISK"    "ATH"    "THE"    "ROM"    "MDD"   
## [29] "WEL"    "GUC"    "UNM"    "CSEM"   "RSPR"   "JMA"    "NN"    
## [36] "CAR"    "SJA"    "TEH"    "BEO"    "UCR"    "SE"     "TUL"   
## [43] "TAP"    "THR"    "LIM"    "US_WEL" "AK"     "PR"
# Agency codes that reported the event itself
unique(df_filtered$source)
##  [1] "ISCGEM"    "ISCGEMSUP" "OFFICIAL"  "CI"        "US"       
##  [6] "NC"        "GCMT"      "UW"        "ATLAS"     "NN"       
## [11] "SE"        "AK"        "PR"

Store categorical variables as factors

# Convert all categorical columns to factors in a single pass
categ_var <- c('type', 'status', 'magnit_source', 'loc_source', 'source')
df_filtered[categ_var] <- lapply(df_filtered[categ_var], factor)

Count observations for each type

# Tally how many observations each event type has
df_filtered %>%
  group_by(type) %>%
  summarise(no_rows = n())
## # A tibble: 4 x 2
##   type              no_rows
##   <fct>               <int>
## 1 Earthquake          23229
## 2 Explosion               4
## 3 Nuclear Explosion     175
## 4 Rock Burst              1

There are very few explosions and rock bursts so we can remove those observations.

# Keep only earthquakes and nuclear explosions; the 4 explosions and the
# single rock burst (see counts above) are too rare to analyse
df_filtered <- df_filtered %>%
  filter(type %in% c('Earthquake', 'Nuclear Explosion'))

We want the variables to contribute equally to the analysis, so we need to make sure that they are on the same scale. So we normalize magnitude, depth and rms

# Min-max scale a numeric vector to [0, 1].
#
# IMPROVEMENT: the original computed range without na.rm, so a single NA
# (rms still contains NAs at this point) turned the entire rescaled
# column into NA. na.rm = TRUE ignores NAs when finding the range; NAs
# in the input remain NA in the output. The new argument defaults to
# TRUE, keeping the call sites below unchanged.
normalize <- function(x, na.rm = TRUE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Keep the unscaled data intact and rescale magnitude, depth and rms in
# a separate copy used for scale-sensitive analysis
df_norm <- df_filtered
df_norm[c('magnit', 'depth', 'rms')] <-
  lapply(df_norm[c('magnit', 'depth', 'rms')], normalize)

Magnitude classes

# Bin magnitude into descriptive classes (left-closed intervals):
#   [5,6) moderate, [6,7) strong, [7,8) major, [8,Inf) great
# FIX: reference the column as bare `magnit` inside mutate() instead of
# df_filtered$magnit — the $-form bypasses the pipeline's data masking
# and silently uses the stale global object if the piped data has been
# filtered or renamed upstream.
df_filtered <- df_filtered %>% 
  mutate(class = factor(cut(
    magnit,
    breaks = c(5, 6, 7, 8, Inf),
    labels = c("moderate", "strong", "major", "great"),
    right  = FALSE
  )))

Some plots

Histograms for numeric variables

# One faceted histogram per numeric column.
# MODERNIZED: gather() is superseded in tidyr; pivot_longer() is the
# current replacement (names_to = 'key', values_to = 'value' reproduces
# gather()'s default column names so the facet spec is unchanged).
df_filtered %>%
  keep(is.numeric) %>%
  pivot_longer(everything(), names_to = 'key', values_to = 'value') %>%
  ggplot(aes(value)) +
  facet_wrap(~key, scales = 'free') +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6057 rows containing non-finite values (stat_bin).

Histogram of earthquakes

# Distribution of magnitudes in half-unit bins
df_filtered %>% 
  ggplot(aes(x=magnit))+
  geom_histogram(binwidth=0.5)

Count earthquakes for each bin

# Tabulate event counts in the same 0.5-wide magnitude bins
df_filtered %>% 
  count(cut_width(magnit,0.5))
## # A tibble: 8 x 2
##   `cut_width(magnit, 0.5)`     n
##   <fct>                    <int>
## 1 [5.25,5.75]              11748
## 2 (5.75,6.25]               8009
## 3 (6.25,6.75]               2486
## 4 (6.75,7.25]                816
## 5 (7.25,7.75]                253
## 6 (7.75,8.25]                 79
## 7 (8.25,8.75]                 10
## 8 (8.75,9.25]                  3

Number of earthquakes for each magnitude class

# Bar chart of event counts per magnitude class;
# geom_col() is the direct equivalent of geom_bar(stat = 'identity')
df_filtered %>%
  group_by(class) %>%
  summarise(events = n()) %>%
  ggplot(aes(x = class, y = events)) +
  geom_col()

Let's zoom in and focus only on moderate and strong earthquakes

# Fine-grained (0.1) histogram restricted to magnitudes below 7,
# i.e. the moderate and strong classes only
df_filtered %>% 
  filter(magnit < 7) %>% 
  ggplot(aes(x=magnit))+
  geom_histogram(binwidth = 0.1)

# Depth distribution per magnitude class as density frequency polygons
# NOTE(review): ..density.. is the legacy spelling; recent ggplot2
# prefers after_stat(density) — confirm the installed version before
# changing
df_filtered %>% 
  ggplot(aes(x=depth,colour=class, y=..density..))+
  geom_freqpoly(binwidth=20)

Plot of earthquakes by type

# World map of event locations, coloured by event type
ggplot(data = df_filtered) + 
  borders('world', colour='gray50',fill='gray50') +
  geom_point(aes(x=lon, y=lat, color = type), alpha = 0.3, size = 0.6)

Plot of earthquakes by magnitude

# World map of event locations, coloured by magnitude class; manual
# colours map in factor-level order: moderate, strong, major, great
ggplot(data = df_filtered) + 
  borders('world', colour='gray50',fill='gray50') +
  geom_point(aes(x=lon, y=lat, color = class), alpha = 0.3, size = 1)+
  scale_colour_manual(values = c("yellow","red", "blue", "black"))

Plot magnitude density by event type

# Overlaid magnitude densities for earthquakes vs nuclear explosions
ggplot(df_filtered, aes(x = magnit)) +
  geom_density(aes(fill = type), alpha = 0.4)

{r} # ggplot(df_filtered, aes(x = depth, y = magnit)) + # geom_point(aes(colour = type, shape = type), alpha = 0.4) #

Plot number of earthquakes per year

# Yearly earthquake counts (nuclear explosions excluded);
# geom_col() is the direct equivalent of geom_bar(stat = 'identity')
df_filtered %>%
  filter(type == 'Earthquake') %>%
  group_by(year) %>%
  summarise(events = n()) %>%
  ggplot(aes(x = year, y = events)) +
  geom_col() +
  theme_minimal()

Plot average depth based on year

# Mean earthquake depth for each year;
# geom_col() is the direct equivalent of geom_bar(stat = 'identity')
df_filtered %>%
  filter(type == 'Earthquake') %>%
  group_by(year) %>%
  summarise(mean_depth = mean(depth)) %>%
  ggplot(aes(x = year, y = mean_depth)) +
  geom_col() +
  theme_minimal()

Plot the total number of earthquakes for each calendar month

# Total earthquake counts aggregated by calendar month (month is a
# factor, so all 12 months appear on the x axis)
df_filtered %>% 
  filter(type == 'Earthquake') %>% 
  group_by(month) %>% 
  summarise(events = n()) %>% 
  ggplot(aes(x = month, y = events))+
  geom_bar(stat = 'identity') +
  theme_minimal()

Plot average depth of earthquake based on month

# Mean event depth per calendar month
# NOTE(review): the Earthquake filter is commented out, so unlike the
# plots above this one also includes nuclear explosions — confirm
# whether that is intentional
df_filtered %>% 
  # filter(type == 'Earthquake') %>% 
  group_by(month) %>% 
  summarise(mean_depth = mean(depth)) %>% 
  ggplot(aes(x = month, y = mean_depth))+
  geom_bar(stat = 'identity') +
  theme_minimal()

Animation of earthquakes across years

# Interactive plotly scatter of event locations, animated frame-by-frame
# over years; fixed axis ranges (wider than lon/lat limits) keep every
# frame in the same viewport
df_filtered %>% 
  plot_ly(
    x = ~lon,
    y = ~lat,
    frame = ~year,
    type = 'scatter',
    mode = 'markers',
    alpha = 0.3
) %>% 
  layout(xaxis = list(range = c(-200, 200)),
         yaxis = list(range = c(-100, 100)))